library(ggplot2)
library(stringi)
library(gridExtra)
library(dendextend)
library(kableExtra)
library(limma)
library(psych)
library(tidyverse)
library(CONSTANd)  # install from source: https://github.com/PDiracDelta/CONSTANd/
library(matrixTests)

This notebook presents isobaric labeling data analysis strategy that includes data-driven normalization.

We will check how varying analysis components [summarization/normalization/differential abundance testing methods] changes end results of a quantitative proteomic study.

source('./other_functions.R')
source('./plotting_functions.R')

# you should either make a symbolic link in this directory
data.list <- readRDS('input_data.rds')
dat.l <- data.list$dat.l # data in long format
# dat.w <- data.list$dat.w # data in wide format
if ('X' %in% colnames(dat.l)) { dat.l$X <- NULL }

# remove shared peptides
shared.peptides <- dat.l %>% filter(!shared.peptide)

# keep spectra with isolation interference <30 and no missing quantification channels
dat.l <- dat.l %>% filter(isoInterOk & noNAs)

# which proteins were spiked in?
spiked.proteins <- dat.l %>% distinct(Protein) %>% filter(stri_detect(Protein, fixed='ups')) %>% pull %>% as.character

# which peptides were identified in each MS run?
unique.pep=dat.l %>% 
  group_by(Run) %>%
  distinct(Peptide) %>% 
  mutate(val=1)
unique.pep <- xtabs(val~Peptide+Run, data=unique.pep)
tmp <- apply(unique.pep, 1, function(x) all(x==1))
inner.peptides <- rownames(unique.pep)[tmp]
# specify # of varying component variants and their names
variant.names <- c('LIMMA', 'Wilcoxon', 'permutation_test')
n.comp.variants <- length(variant.names)
scale.vec <- c('log', 'log', 'log') 
# pick reference channel and condition for making plots / doing DEA
referenceChannel <- '127C'
referenceCondition <- '0.5'
# specify colours corresponding to biological conditions
condition.colour <- tribble(
  ~Condition, ~Colour,
  "0.125", 'black',
  "0.5", 'blue',
  "0.667", 'green',
  "1", 'red' )
# create data frame with sample info (distinct Run,Channel, Sample, Condition, Colour)
sample.info <- get_sample_info(dat.l, condition.colour)
channelNames <- remove_factors(unique(sample.info$Channel))
# set seed for permutation test
seed=9987

1 Unit component: log2 transformation of reporter ion intensities

dat.unit.l <- dat.l
dat.unit.l <- dat.l %>% mutate(response=log2(Intensity)) %>% select(-Intensity)

2 Normalization component: medianSweeping (1)

# switch to wide format
dat.unit.w <- pivot_wider(data = dat.unit.l, id_cols=-one_of(c('Condition', 'BioReplicate')), names_from=Channel, values_from=response) 
# subtract the spectrum median log2intensity from the observed log2intensities
dat.norm.w <- dat.unit.w
dat.norm.w[,channelNames] <- sweep(dat.norm.w[,channelNames], 1, apply(dat.norm.w[,channelNames], 1, median) )

3 Summarization component: Median summarization

Summarize quantification values from PSM to peptide (first step) to protein (second step).

# normalized data
# group by (run,)protein,peptide then summarize twice (once on each level)
dat.norm.summ.w <- dat.norm.w %>% group_by(Run, Protein, Peptide) %>% summarise_at(.vars = channelNames, .funs = median) %>% summarise_at(.vars = channelNames, .funs = median) %>% ungroup() 

Notice that the row sums are not equal to Ncols anymore, because the median summarization does not preserve them (but mean summarization does).

Let’s also summarize the non-normalized data for comparison in the next section.

# non-normalized data
# add select() statement because summarise_at is going bananas over character columns
dat.nonnorm.summ.w <- dat.unit.w %>% group_by(Run, Protein, Peptide) %>% select(Run, Protein, Peptide, channelNames) %>% summarise_at(.vars = channelNames, .funs = median) %>% select(Run, Protein, channelNames) %>% summarise_at(.vars = channelNames, .funs = median) %>% ungroup()

4 Normalization component: medianSweeping (2)

# medianSweeping: in each channel, subtract median computed across all proteins within the channel
# do the above separately for each MS run
x.split <- split(dat.norm.summ.w, dat.norm.summ.w$Run)  
x.split.norm  <- lapply(x.split, function(y) {
  y[,channelNames] <- sweep(y[,channelNames], 2, apply(y[,channelNames], 2, median) )
  return(y)
})
dat.norm.summ.w <- bind_rows(x.split.norm)

5 QC plots

# make data completely wide (also across runs)

## non-normalized data
dat.nonnorm.summ.w2 <- dat.nonnorm.summ.w %>% pivot_wider(names_from = Run, values_from = all_of(channelNames), names_glue = "{Run}:{.value}")

## normalized data
dat.norm.summ.w2 <- dat.norm.summ.w %>% pivot_wider(names_from = Run, values_from = all_of(channelNames), names_glue = "{Run}:{.value}")

5.1 Boxplots

# use (half-)wide format
par(mfrow=c(1,2))
  boxplot_w(dat.nonnorm.summ.w,sample.info, 'Raw')
  boxplot_w(dat.norm.summ.w, sample.info, 'Normalized')

par(mfrow=c(1,1))

5.2 MA plots

MA plots of two single samples taken from condition 1 and condition 0.125, measured in different MS runs (samples Mixture2_1:127C and Mixture1_2:129N, respectively).

# different unit variants require different computation of fold changes and average abuandance: additive or multiplicative scale; see maplot_ils function 
# use wide2 format
p1 <- maplot_ils(dat.nonnorm.summ.w2, 'Mixture2_1:127C', 'Mixture1_2:129N', scale='log', 'Raw')
p2 <- maplot_ils(dat.norm.summ.w2, 'Mixture2_1:127C', 'Mixture1_2:129N', scale='log', 'Normalized')
grid.arrange(p1, p2, ncol=2)  

5.3 MA plots of all samples from condition 1 and condition 0.125 (quantification values averaged within condition).

# different unit variants require different computation of fold changes and average abuandance: additive or multiplicative scale; see maplot_ils function 
channels.num <- sample.info %>% filter(Condition=='1') %>% select(Sample) %>% pull
channels.denom <- sample.info %>% filter(Condition=='0.125') %>% select(Sample) %>% pull

p1 <- maplot_ils(dat.nonnorm.summ.w2, channels.num, channels.denom, scale='log', 'Raw')
p2 <- maplot_ils(dat.norm.summ.w2, channels.num, channels.denom, scale='log', 'Normalized')
grid.arrange(p1, p2, ncol=2)  

5.4 CV (coefficient of variation) plots

dat.nonnorm.summ.w$Mixture <- unlist(lapply(stri_split(dat.nonnorm.summ.w$Run,fixed='_'), function(x) x[1]))
dat.nonnorm.summ.l <- to_long_format(dat.nonnorm.summ.w, sample.info)

dat.norm.summ.w$Mixture <- unlist(lapply(stri_split(dat.norm.summ.w$Run,fixed='_'), function(x) x[1]))
dat.norm.summ.l <- to_long_format(dat.norm.summ.w, sample.info)

par(mfrow=c(1, 2))
  cvplot_ils(dat=dat.nonnorm.summ.l, feature.group='Protein', xaxis.group='Condition', title='Raw', abs=F)
  cvplot_ils(dat=dat.norm.summ.l, feature.group='Protein', xaxis.group='Condition', title='Normalized', abs=F)

par(mfrow=c(1, 1))  

5.5 PCA plots

5.5.1 Using all proteins

par(mfrow=c(1, 2))
  pcaplot_ils(dat.nonnorm.summ.w2 %>% select(-'Protein'), info=sample.info, 'Raw')
  pcaplot_ils(dat.norm.summ.w2 %>% select(-'Protein'), info=sample.info, 'Normalized')

par(mfrow=c(1, 1))  

5.5.2 Using spiked proteins only

par(mfrow=c(1, 2))
  pcaplot_ils(dat.nonnorm.summ.w2 %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, 'Raw')
  pcaplot_ils(dat.norm.summ.w2 %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, 'Normalized')

par(mfrow=c(1, 1))  

5.6 HC (hierarchical clustering) plots

Only use spiked proteins

par(mfrow=c(1, 2))
  dendrogram_ils(dat.nonnorm.summ.w2 %>% filter(Protein %in% spiked.proteins) %>% select(-Protein), info=sample.info, 'Raw')
  dendrogram_ils(dat.norm.summ.w2 %>% filter(Protein %in% spiked.proteins) %>% select(-Protein), info=sample.info, 'Normalized')

par(mfrow=c(1, 1))  

6 DEA component

6.1 Moderated t-test

TODO: - Also try to log-transform the intensity case, to see if there are large differences in the t-test results. - done. remove this code? NOTE: - actually, lmFit (used in moderated_ttest) was built for log2-transformed data. However, supplying untransformed intensities can also work. This just means that the effects in the linear model are also additive on the untransformed scale, whereas for log-transformed data they are multiplicative on the untransformed scale. Also, there may be a bias which occurs from biased estimates of the population means in the t-tests, as mean(X) is not equal to exp(mean(log(X))).

design.matrix <- get_design_matrix(referenceCondition, sample.info)
dat.dea <- emptyList(variant.names)
this_scale <- scale.vec[1]
d <- column_to_rownames(as.data.frame(dat.norm.summ.w2), 'Protein')
dat.dea$LIMMA <- moderated_ttest(dat=d, design.matrix, scale=this_scale) 

6.2 Wilcoxon test

otherConditions <- dat.l %>% distinct(Condition) %>% pull(Condition) %>% as.character %>% sort
otherConditions <- otherConditions[-match(referenceCondition, otherConditions)]
dat.dea$Wilcoxon <- wilcoxon_test(dat.norm.summ.w2, sample.info, referenceCondition, otherConditions, logFC.method='ratio')

6.3 Permutation test

dat.dea$permutation_test<- permutation_test(dat.norm.summ.w2, sample.info, referenceCondition, otherConditions, B=1000, seed=seed)

7 Results comparison

7.1 Confusion matrix

cm <- conf_mat(dat.dea, 'q.mod', 0.05, spiked.proteins)
print_conf_mat(cm)
0.125
LIMMA
Wilcoxon
permutation_test
background spiked background spiked background spiked
not_DE 4061 4 4064 19 4064 19
DE 3 15 0 0 0 0
0.125
LIMMA Wilcoxon permutation_test
Accuracy 0.998 0.995 0.995
Sensitivity 0.789 0.000 0.000
Specificity 0.999 1.000 1.000
PPV 0.833 NaN NaN
NPV 0.999 0.995 0.995
0.667
LIMMA
Wilcoxon
permutation_test
background spiked background spiked background spiked
not_DE 4062 17 4064 19 4064 19
DE 2 2 0 0 0 0
0.667
LIMMA Wilcoxon permutation_test
Accuracy 0.995 0.995 0.995
Sensitivity 0.105 0.000 0.000
Specificity 1.000 1.000 1.000
PPV 0.500 NaN NaN
NPV 0.996 0.995 0.995
1
LIMMA
Wilcoxon
permutation_test
background spiked background spiked background spiked
not_DE 4060 5 4064 19 4064 19
DE 4 14 0 0 0 0
1
LIMMA Wilcoxon permutation_test
Accuracy 0.998 0.995 0.995
Sensitivity 0.737 0.000 0.000
Specificity 0.999 1.000 1.000
PPV 0.778 NaN NaN
NPV 0.999 0.995 0.995

7.2 Scatter plots

# character vectors containing logFC and p-values columns
dea.cols <- colnames(dat.dea[[1]])
logFC.cols <- dea.cols[stri_detect_fixed(dea.cols, 'logFC')]
q.cols <- dea.cols[stri_detect_fixed(dea.cols, 'q.mod')]
n.contrasts <- length(logFC.cols)

scatterplot_ils(dat.dea, q.cols, 'q-values')

scatterplot_ils(dat.dea, logFC.cols, 'log2FC')

7.3 Volcano plots

for (i in 1:n.contrasts){
  volcanoplot_ils(dat.dea, i, spiked.proteins) }

7.4 Violin plots

Let’s see whether the spiked protein fold changes make sense

# plot theoretical value (horizontal lines) and violin per condition
dat.spiked.logfc <- lapply(dat.dea, function(x) x[spiked.proteins,logFC.cols])
dat.spiked.logfc.l <- lapply(dat.spiked.logfc, function(x) {
  x %>% rename_with(function(y) sapply(y, function(z) strsplit(z, '_')[[1]][2])) %>% pivot_longer(cols = everything(), names_to = 'condition', values_to = 'logFC') %>% add_column(Protein=rep(rownames(x), each=length(colnames(x)))) })
violinplot_ils(lapply(dat.spiked.logfc.l, filter, condition != referenceCondition))

8 Conclusions

9 Session information

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=de_BE.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_BE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_BE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_BE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] matrixTests_0.1.9 CONSTANd_0.99.4   forcats_0.5.0     stringr_1.4.0    
##  [5] dplyr_1.0.2       purrr_0.3.4       readr_1.4.0       tidyr_1.1.2      
##  [9] tibble_3.0.4      tidyverse_1.3.0   psych_2.0.9       limma_3.45.19    
## [13] kableExtra_1.3.1  dendextend_1.14.0 gridExtra_2.3     stringi_1.5.3    
## [17] ggplot2_3.3.2    
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-150         matrixStats_0.57.0   fs_1.5.0            
##  [4] lubridate_1.7.9      webshot_0.5.2        httr_1.4.2          
##  [7] tools_4.0.3          backports_1.1.10     R6_2.4.1            
## [10] rpart_4.1-15         DBI_1.1.0            mgcv_1.8-33         
## [13] colorspace_1.4-1     nnet_7.3-14          withr_2.3.0         
## [16] tidyselect_1.1.0     mnormt_2.0.2         compiler_4.0.3      
## [19] cli_2.1.0            rvest_0.3.6          xml2_1.3.2          
## [22] labeling_0.4.2       scales_1.1.1         digest_0.6.27       
## [25] rmarkdown_2.5        pkgconfig_2.0.3      htmltools_0.5.0     
## [28] highr_0.8            dbplyr_1.4.4         rlang_0.4.8         
## [31] readxl_1.3.1         rstudioapi_0.11      farver_2.0.3        
## [34] generics_0.0.2       jsonlite_1.7.1       ModelMetrics_1.2.2.2
## [37] magrittr_1.5         Matrix_1.2-18        Rcpp_1.0.5          
## [40] munsell_0.5.0        fansi_0.4.1          viridis_0.5.1       
## [43] lifecycle_0.2.0      pROC_1.16.2          yaml_2.2.1          
## [46] MASS_7.3-53          plyr_1.8.6           recipes_0.1.14      
## [49] grid_4.0.3           blob_1.2.1           parallel_4.0.3      
## [52] crayon_1.3.4         lattice_0.20-41      haven_2.3.1         
## [55] splines_4.0.3        hms_0.5.3            tmvnsim_1.0-2       
## [58] knitr_1.30           pillar_1.4.6         stats4_4.0.3        
## [61] reshape2_1.4.4       codetools_0.2-16     reprex_0.3.0        
## [64] glue_1.4.2           evaluate_0.14        data.table_1.13.2   
## [67] modelr_0.1.8         vctrs_0.3.4          foreach_1.5.1       
## [70] cellranger_1.1.0     gtable_0.3.0         assertthat_0.2.1    
## [73] xfun_0.18            gower_0.2.2          prodlim_2019.11.13  
## [76] broom_0.7.2          e1071_1.7-4          class_7.3-17        
## [79] survival_3.2-7       viridisLite_0.3.0    timeDate_3043.102   
## [82] iterators_1.0.13     lava_1.6.8           ellipsis_0.3.1      
## [85] caret_6.0-86         ipred_0.9-9